# -*- coding: utf-8 -*-
import scrapy
import requests
from bs4 import BeautifulSoup
from scrapy import Request
import math
from scrapylijia.items import lianjiaItem

class LijiaSpider(scrapy.Spider):
    """Spider for Tianjin Lianjia whole-rent listings (tj.lianjia.com/zufang).

    Crawl flow:
      parse        -> one request per district page
                      (e.g. https://tj.lianjia.com/zufang/heping/rt200600000001/?showMore=1)
      parse_index  -> one request per listing detail page
                      (e.g. https://tj.lianjia.com/zufang/TJ....html?nav=200600000001)
      parse_detail -> one lianjiaItem per listing
    """
    name = 'lijia'
    allowed_domains = ['tj.lianjia.com']
    start_urls = ['https://tj.lianjia.com/zufang/rt200600000001/?showMore=1']

    def parse(self, response):
        """Parse the city-wide rental page and yield a request per district.

        :param response: Scrapy response for a start_urls page.
        :yields: Request objects handled by parse_index.
        """
        soup = BeautifulSoup(response.text, 'lxml')
        areas = soup.find_all(name='div', class_='filter__wrapper w1150', id='filter')
        for area in areas:
            for urls in area.find_all(name='ul', class_=''):
                # NOTE: the trailing space in 'filter__item--level2 ' is intentional —
                # it matches the site's exact class attribute value.
                for url in urls.find_all(name='li', class_='filter__item--level2 '):
                    lj = 'https://tj.lianjia.com' + url.a['href'] + '?showMore=1'
                    yield Request(url=lj, callback=self.parse_index)

    def parse_index(self, response):
        """Parse a district listing page and yield a request per listing.

        :param response: Scrapy response for a district rental page.
        :yields: Request objects handled by parse_detail.
        """
        soup = BeautifulSoup(response.text, 'lxml')
        rooms = soup.find_all(name='div', class_='content__list')
        for room in rooms:
            for r in room.find_all(name='a', class_='content__list--item--aside'):
                href = r['href']
                # Links containing 'apartment' are serviced apartments with very
                # little data; skip them for now.
                if 'apartment' in href:
                    continue
                # Blacklisted broken listing. BUGFIX: the original compared the
                # built URL against an 'https://m.lianjia.com/...' address, which
                # could never match (URLs here always start with tj.lianjia.com);
                # match the listing ID instead.
                if 'TJ2199019799537778688' in href:
                    continue
                yield Request(url='https://tj.lianjia.com' + href,
                              callback=self.parse_detail)

    def parse_detail(self, response):
        """Parse one listing detail page into a lianjiaItem.

        :param response: Scrapy response for a listing detail page.
        :yields: a single populated lianjiaItem.
        """
        soup = BeautifulSoup(response.text, 'lxml')
        item = lianjiaItem()
        item['title'] = soup.head.title.text  # listing title from <head><title>
        item['url'] = response.url

        # Lease type / layout / area / orientation summary row.
        for hxs in soup.find_all(name='p', class_='content__article__table'):
            spans = hxs.find_all(name='span')  # hoisted: was re-queried 4 times
            # Guard: original indexed [0]..[3] unconditionally and raised
            # IndexError on atypical pages.
            if len(spans) >= 4:
                item['zlfs'] = spans[0].text   # 租赁方式 (lease type)
                item['hx'] = spans[1].text     # 户型 (layout)
                item['area'] = spans[2].text   # 面积 (area)
                item['cx'] = spans[3].text     # 朝向 (orientation)

        # Price. Guard: original used find_all(...)[0] and crashed when the
        # element was missing (e.g. anti-bot interstitial pages).
        prices = soup.find_all(name='p', class_='content__aside--title')
        if prices:
            item['price'] = prices[0].text

        # Feature tags, '|'-separated.
        for flags in soup.find_all(name='p', class_='content__aside--tags'):
            item['flag'] = flags.text.strip().replace('\n', '|')

        # Basic info list: floor, utilities, elevator, gas, parking.
        for base_infos in soup.find_all(name='div', class_='content__article__info'):
            for base in base_infos(name='li'):
                text = base.text  # hoisted: was recomputed per comparison
                if '楼层' in text:
                    item['floor'] = text.replace('楼层：', '')
                if '用电' in text:
                    item['dian'] = text.replace('用电：', '')
                if '用水' in text:
                    item['shui'] = text.replace('用水：', '')
                if '电梯' in text:
                    item['dt'] = text.replace('电梯：', '')
                if '燃气' in text:
                    item['rq'] = text.replace('燃气：', '')
                if '车位' in text:
                    item['cw'] = text.replace('车位：', '')

        # Nearby transit. BUGFIX: the original overwrote item['jt'] on every
        # iteration, keeping only the last <li>; join all entries instead.
        jt_parts = []
        for jts in soup.find_all(name='div', class_='content__article__info4', id='around'):
            for jt in jts(name='li'):
                jt_parts.append(jt.text.replace('\n', '').replace(' ', ''))
        if jt_parts:
            item['jt'] = '|'.join(jt_parts)

        yield item
